{
  "nbformat": 4,
  "nbformat_minor": 0,
  "metadata": {
    "colab": {
      "provenance": [],
      "authorship_tag": "ABX9TyOj68rOFVk/fSt+6/mGaOcl",
      "include_colab_link": true
    },
    "kernelspec": {
      "name": "python3",
      "display_name": "Python 3"
    },
    "language_info": {
      "name": "python"
    }
  },
  "cells": [
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "view-in-github",
        "colab_type": "text"
      },
      "source": [
        "<a href=\"https://colab.research.google.com/github/sugarforever/LangChain-Advanced/blob/main/Retrievers/02_Contextual_Compression_Retriever.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "xPWZLTgY5E_4",
        "outputId": "c87cf6f1-c676-4faf-8fb7-def934a3f5da"
      },
      "outputs": [
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "\u001b[2K     \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m1.8/1.8 MB\u001b[0m \u001b[31m18.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
            "\u001b[2K     \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m77.0/77.0 kB\u001b[0m \u001b[31m7.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
            "\u001b[2K     \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m437.8/437.8 kB\u001b[0m \u001b[31m32.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
            "\u001b[2K     \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m2.0/2.0 MB\u001b[0m \u001b[31m70.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
            "\u001b[2K     \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m2.4/2.4 MB\u001b[0m \u001b[31m61.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
            "\u001b[2K     \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m66.3/66.3 kB\u001b[0m \u001b[31m5.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
            "\u001b[2K     \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m59.5/59.5 kB\u001b[0m \u001b[31m6.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
            "\u001b[2K     \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m5.4/5.4 MB\u001b[0m \u001b[31m80.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
            "\u001b[2K     \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m6.2/6.2 MB\u001b[0m \u001b[31m82.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
            "\u001b[2K     \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m3.8/3.8 MB\u001b[0m \u001b[31m55.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
            "\u001b[2K     \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m67.3/67.3 kB\u001b[0m \u001b[31m3.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
            "\u001b[?25h  Installing build dependencies ... \u001b[?25l\u001b[?25hdone\n",
            "  Getting requirements to build wheel ... \u001b[?25l\u001b[?25hdone\n",
            "  Preparing metadata (pyproject.toml) ... \u001b[?25l\u001b[?25hdone\n",
            "\u001b[2K     \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m593.7/593.7 kB\u001b[0m \u001b[31m42.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
            "\u001b[2K     \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m49.4/49.4 kB\u001b[0m \u001b[31m4.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
            "\u001b[2K     \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m67.0/67.0 kB\u001b[0m \u001b[31m5.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
            "\u001b[2K     \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m46.0/46.0 kB\u001b[0m \u001b[31m4.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
            "\u001b[2K     \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m268.8/268.8 kB\u001b[0m \u001b[31m20.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
            "\u001b[2K     \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m58.3/58.3 kB\u001b[0m \u001b[31m5.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
            "\u001b[2K     \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m428.8/428.8 kB\u001b[0m \u001b[31m26.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
            "\u001b[2K     \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m4.1/4.1 MB\u001b[0m \u001b[31m81.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
            "\u001b[2K     \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m1.3/1.3 MB\u001b[0m \u001b[31m60.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
            "\u001b[2K     \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m129.9/129.9 kB\u001b[0m \u001b[31m11.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
            "\u001b[2K     \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m86.8/86.8 kB\u001b[0m \u001b[31m8.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
            "\u001b[?25h  Building wheel for pypika (pyproject.toml) ... \u001b[?25l\u001b[?25hdone\n"
          ]
        }
      ],
      "source": [
        "!pip install -q -U langchain openai chromadb tiktoken"
      ]
    },
    {
      "cell_type": "code",
      "source": [
        "import os\n",
        "os.environ['OPENAI_API_KEY'] = \"your valid openai api key\""
      ],
      "metadata": {
        "id": "eVQjKJn-5ZZZ"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "def pretty_print_docs(docs):\n",
        "    print(f\"\\n{'-' * 100}\\n\".join([f\"Document {i+1}:\\n\\n\" + d.page_content for i, d in enumerate(docs)]))"
      ],
      "metadata": {
        "id": "gNt6WEub04Rz"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "!mkdir data\n",
        "!wget https://raw.githubusercontent.com/jerryjliu/llama_index/main/examples/paul_graham_essay/data/paul_graham_essay.txt -O data/paul_graham_essay.txt"
      ],
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "7PyoqHpu6Ha2",
        "outputId": "940f029e-a555-4518-eb31-436ebfb3ad86"
      },
      "execution_count": null,
      "outputs": [
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "--2023-09-30 21:04:52--  https://raw.githubusercontent.com/jerryjliu/llama_index/main/examples/paul_graham_essay/data/paul_graham_essay.txt\n",
            "Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...\n",
            "Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.\n",
            "HTTP request sent, awaiting response... 200 OK\n",
            "Length: 75047 (73K) [text/plain]\n",
            "Saving to: ‘data/paul_graham_essay.txt’\n",
            "\n",
            "\r          data/paul   0%[                    ]       0  --.-KB/s               \rdata/paul_graham_es 100%[===================>]  73.29K  --.-KB/s    in 0.03s   \n",
            "\n",
            "2023-09-30 21:04:53 (2.72 MB/s) - ‘data/paul_graham_essay.txt’ saved [75047/75047]\n",
            "\n"
          ]
        }
      ]
    },
    {
      "cell_type": "code",
      "source": [
        "question = \"Where did Paul Graham study?\""
      ],
      "metadata": {
        "id": "pDr0keyO1qIx"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "from langchain.text_splitter import RecursiveCharacterTextSplitter\n",
        "from langchain.embeddings import OpenAIEmbeddings\n",
        "from langchain.document_loaders import WebBaseLoader\n",
        "from langchain.vectorstores import Chroma\n",
        "\n",
        "# Load blog post\n",
        "loader = WebBaseLoader(\"https://raw.githubusercontent.com/jerryjliu/llama_index/main/examples/paul_graham_essay/data/paul_graham_essay.txt\")\n",
        "data = loader.load()\n",
        "\n",
        "# Split\n",
        "text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=0)\n",
        "splits = text_splitter.split_documents(data)\n",
        "\n",
        "retriever = Chroma.from_documents(splits, OpenAIEmbeddings(), collection_name=\"example\").as_retriever()\n",
        "\n",
        "docs = retriever.get_relevant_documents(question)\n",
        "pretty_print_docs(docs)"
      ],
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "Zlx1PgXj02Sl",
        "outputId": "ef146e52-97b7-4771-fc43-cc479292989a"
      },
      "execution_count": null,
      "outputs": [
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "Document 1:\n",
            "\n",
            "So I said no more essays till Bel was done. But I told few people about Bel while I was working on it. So for years it must have seemed that I was doing nothing, when in fact I was working harder than I'd ever worked on anything. Occasionally after wrestling for hours with some gruesome bug I'd check Twitter or HN and see someone asking \"Does Paul Graham still code?\"\n",
            "----------------------------------------------------------------------------------------------------\n",
            "Document 2:\n",
            "\n",
            "I applied to 3 grad schools: MIT and Yale, which were renowned for AI at the time, and Harvard, which I'd visited because Rich Draves went there, and was also home to Bill Woods, who'd invented the type of parser I used in my SHRDLU clone. Only Harvard accepted me, so that was where I went.\n",
            "----------------------------------------------------------------------------------------------------\n",
            "Document 3:\n",
            "\n",
            "That fall I started taking art classes at Harvard. Grad students could take classes in any department, and my advisor, Tom Cheatham, was very easy going. If he even knew about the strange classes I was taking, he never said anything.\n",
            "----------------------------------------------------------------------------------------------------\n",
            "Document 4:\n",
            "\n",
            "Arc, in a house I bought in Cambridge.\n"
          ]
        }
      ]
    },
    {
      "cell_type": "code",
      "source": [
        "from langchain.llms import OpenAI\n",
        "from langchain.retrievers import ContextualCompressionRetriever\n",
        "from langchain.retrievers.document_compressors import LLMChainExtractor\n",
        "\n",
        "llm = OpenAI(temperature=0)\n",
        "compressor = LLMChainExtractor.from_llm(llm)\n",
        "compression_retriever = ContextualCompressionRetriever(base_compressor=compressor, base_retriever=retriever)\n",
        "\n",
        "compressed_docs = compression_retriever.get_relevant_documents(question)\n",
        "pretty_print_docs(compressed_docs)"
      ],
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "zCvxJLSp1Ewa",
        "outputId": "ed90aea5-a18f-40fb-bd07-a1509f7a3855"
      },
      "execution_count": null,
      "outputs": [
        {
          "output_type": "stream",
          "name": "stderr",
          "text": [
            "/usr/local/lib/python3.10/dist-packages/langchain/chains/llm.py:280: UserWarning: The predict_and_parse method is deprecated, instead pass an output parser directly to LLMChain.\n",
            "  warnings.warn(\n"
          ]
        },
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "Document 1:\n",
            "\n",
            "Harvard\n",
            "----------------------------------------------------------------------------------------------------\n",
            "Document 2:\n",
            "\n",
            "\"Harvard\"\n",
            "----------------------------------------------------------------------------------------------------\n",
            "Document 3:\n",
            "\n",
            "Arc, Cambridge.\n"
          ]
        }
      ]
    },
    {
      "cell_type": "code",
      "source": [
        "from langchain.retrievers.document_compressors import LLMChainFilter\n",
        "\n",
        "_filter = LLMChainFilter.from_llm(llm)\n",
        "compression_retriever = ContextualCompressionRetriever(base_compressor=_filter, base_retriever=retriever)\n",
        "\n",
        "compressed_docs = compression_retriever.get_relevant_documents(question)\n",
        "pretty_print_docs(compressed_docs)"
      ],
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "Xa-6AdPu1GwG",
        "outputId": "930bd734-522d-46ae-efea-32c7255ce189"
      },
      "execution_count": null,
      "outputs": [
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "Document 1:\n",
            "\n",
            "I applied to 3 grad schools: MIT and Yale, which were renowned for AI at the time, and Harvard, which I'd visited because Rich Draves went there, and was also home to Bill Woods, who'd invented the type of parser I used in my SHRDLU clone. Only Harvard accepted me, so that was where I went.\n"
          ]
        }
      ]
    },
    {
      "cell_type": "code",
      "source": [
        "from langchain.embeddings import OpenAIEmbeddings\n",
        "from langchain.retrievers.document_compressors import EmbeddingsFilter\n",
        "\n",
        "embeddings = OpenAIEmbeddings()\n",
        "embeddings_filter = EmbeddingsFilter(embeddings=embeddings, similarity_threshold=0.8)\n",
        "compression_retriever = ContextualCompressionRetriever(base_compressor=embeddings_filter, base_retriever=retriever)\n",
        "\n",
        "compressed_docs = compression_retriever.get_relevant_documents(question)\n",
        "pretty_print_docs(compressed_docs)"
      ],
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "V-3OzgRT1IVP",
        "outputId": "56d5286c-fca2-444d-bb5b-5f28650ce4aa"
      },
      "execution_count": null,
      "outputs": [
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "Document 1:\n",
            "\n",
            "So I said no more essays till Bel was done. But I told few people about Bel while I was working on it. So for years it must have seemed that I was doing nothing, when in fact I was working harder than I'd ever worked on anything. Occasionally after wrestling for hours with some gruesome bug I'd check Twitter or HN and see someone asking \"Does Paul Graham still code?\"\n",
            "----------------------------------------------------------------------------------------------------\n",
            "Document 2:\n",
            "\n",
            "I applied to 3 grad schools: MIT and Yale, which were renowned for AI at the time, and Harvard, which I'd visited because Rich Draves went there, and was also home to Bill Woods, who'd invented the type of parser I used in my SHRDLU clone. Only Harvard accepted me, so that was where I went.\n"
          ]
        }
      ]
    },
    {
      "cell_type": "code",
      "source": [
        "from langchain.document_transformers import EmbeddingsRedundantFilter\n",
        "from langchain.retrievers.document_compressors import DocumentCompressorPipeline\n",
        "from langchain.text_splitter import CharacterTextSplitter\n",
        "\n",
        "splitter = CharacterTextSplitter(chunk_size=300, chunk_overlap=0, separator=\". \")\n",
        "redundant_filter = EmbeddingsRedundantFilter(embeddings=embeddings)\n",
        "relevant_filter = EmbeddingsFilter(embeddings=embeddings, similarity_threshold=0.76)\n",
        "pipeline_compressor = DocumentCompressorPipeline(\n",
        "    transformers=[splitter, redundant_filter, relevant_filter]\n",
        ")"
      ],
      "metadata": {
        "id": "JmesmHkY1KRg"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "compression_retriever = ContextualCompressionRetriever(base_compressor=pipeline_compressor, base_retriever=retriever)\n",
        "\n",
        "compressed_docs = compression_retriever.get_relevant_documents(question)\n",
        "pretty_print_docs(compressed_docs)"
      ],
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "gLT2CB4h1MeS",
        "outputId": "4f2364cc-458e-43cd-857c-62ef1b8e48f2"
      },
      "execution_count": null,
      "outputs": [
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "Document 1:\n",
            "\n",
            "Occasionally after wrestling for hours with some gruesome bug I'd check Twitter or HN and see someone asking \"Does Paul Graham still code?\"\n",
            "----------------------------------------------------------------------------------------------------\n",
            "Document 2:\n",
            "\n",
            "I applied to 3 grad schools: MIT and Yale, which were renowned for AI at the time, and Harvard, which I'd visited because Rich Draves went there, and was also home to Bill Woods, who'd invented the type of parser I used in my SHRDLU clone. Only Harvard accepted me, so that was where I went.\n",
            "----------------------------------------------------------------------------------------------------\n",
            "Document 3:\n",
            "\n",
            "That fall I started taking art classes at Harvard. Grad students could take classes in any department, and my advisor, Tom Cheatham, was very easy going. If he even knew about the strange classes I was taking, he never said anything.\n",
            "----------------------------------------------------------------------------------------------------\n",
            "Document 4:\n",
            "\n",
            "Arc, in a house I bought in Cambridge.\n"
          ]
        }
      ]
    }
  ]
}